Introduction

This is an investigation on the use of Topic Modeling on the course catalog at Florida Polytechnic University.

params$department
## [1] "Computer Science"
library(tidytext)
library(textmineR)
library(broom)
library(tidyr)
library(dplyr)
library(ggplot2)
library(here)
library(stringr)
library(DT)
library(proxy)
library(smacof)
library(ggrepel)
# library(MASS)
library(stringdist)
library(widyr)
library(igraph)
library(ggraph)
ptsize <- 2
legends <- TRUE
dist_lmt <- 3.4
library(readr, quietly = T)

set.seed(543)
source(here("transform_course_data.R"))
data <- read_csv(here("data/courses-list-fpu.csv"))
filter_regex = ""
replace_regex = ""
filtering_string <- regex(paste0("^Week|\\s{2}|^\\s{1}|^Quiz|^Chapter|^Case|^http|Ch.|^Incoterms|Exam|Presentations|www|", 
                                   filter_regex), 
                            ignore_case = TRUE)
replace_string <- regex(paste0("^\\d{1}\\. |\\d{2}\\. |^Lab \\d{1}. |^Lab \\d{2}. |^[a-z]. |^\\d{1}.|", 
                               replace_regex), 
                        ignore_case = TRUE)

data <- clean_columns(data) 


# This will eventually be its own script
outl_df <- data %>% 
  mutate(new_col = strsplit(as.character(Course_Description), "[\\\r\\\n\\\t]+")) 

main_outl_df <- outl_df %>% tidyr::separate_rows(new_col, sep = "^[0-9].")  %>% filter(!grepl("^\\d{1}\\. |\\d{2}\\. ", new_col))
# Getting everything else just in case we need them
side_outl_df <- outl_df %>% tidyr::separate_rows(new_col, sep = "^[0-9].") %>% filter(!grepl(paste0("^\\d{1}\\. |\\d{2}\\. "), new_col)) 

main_outl_df$new_col <- main_outl_df$new_col %>% 
  str_replace_all(replace_string, "")


# Separate 
side_outl_new <- side_outl_df %>% 
  filter(!grepl(filtering_string, 
                new_col, ignore.case = TRUE)) %>% 
  filter(!is.na(new_col))


side_outl_new$new_col <- side_outl_new$new_col %>% str_replace_all(replace_string, "")
# Joining the two dataframes for the new_col
full_outl <- main_outl_df %>% rbind(side_outl_new)

# Filtering to the department
full_outl <- full_outl %>% 
  filter(Department_Name == params$department)
  # filter(Department_Name == "Computer Science")
  # filter(Department_Name == "Data Science and Business Analytics")


# Getting bigrams
terms_bigram <- full_outl %>% 
  select(c(Course_ID, new_col)) %>% 
  unnest_tokens("desc_word", new_col, token = "ngrams", n = 2) %>% 
  separate(desc_word, c("word1", "word2")) %>% 
  filter(!word1 %in% c(stop_words$word, "research", "scientific", "paper", "guest", "topics", "based", "covers", "current", "toolset", "current", "student", "unknown", "senior", "relevant", "term")) %>%
  filter(!grepl("^[0-9]", word1)) %>% 
  filter(!word2 %in% c(stop_words$word, "include", "information", "sources", "project", "term", "base")) %>% 
  filter(!grepl("^[0-9]", word2)) %>% 
  unite(desc_bigram, word1, word2, sep = " ") %>% 
  filter(!desc_bigram == "NA NA")

bigram_dtm <- terms_bigram %>% 
  count(Course_ID, desc_bigram, sort = TRUE) %>% 
  cast_dtm(Course_ID, desc_bigram, n)

# List of course_id matched to course names 
course_list <- split(full_outl$Name, full_outl$Course_ID)

LDA

The first test is an LDA model with k = 5 using the Gibbs method.

library(topicmodels)
# k = 5 for the number of concentrations
bigram_lda <- LDA(bigram_dtm, k = ifelse(params$department == "Computer Science", 6, 5), method = "Gibbs", control=list(iter = 500, verbose = 25, alpha = 0.2))
## K = 6; V = 1066; M = 75
## Sampling 500 iterations!
## Iteration 25 ...
## Iteration 50 ...
## Iteration 75 ...
## Iteration 100 ...
## Iteration 125 ...
## Iteration 150 ...
## Iteration 175 ...
## Iteration 200 ...
## Iteration 225 ...
## Iteration 250 ...
## Iteration 275 ...
## Iteration 300 ...
## Iteration 325 ...
## Iteration 350 ...
## Iteration 375 ...
## Iteration 400 ...
## Iteration 425 ...
## Iteration 450 ...
## Iteration 475 ...
## Iteration 500 ...
## Gibbs sampling completed!

Probabilities

Topic-Term Probabilities (Betas)

course_topics <- tidy(bigram_lda, matrix = "beta")
course_topics %>% 
  datatable()

Document-Topic Probabilities (Gamma)

course_docs <- tidy(bigram_lda, matrix = "gamma")

course_docs %>% 
  group_by(topic) %>%
  slice_max(gamma, n = 5) %>% 
  ungroup() %>%
  arrange(topic, -gamma) %>% 
  datatable()
# course_top_docs %>%
#   mutate(document = reorder_within(document, gamma, topic)) %>%
#   ggplot(aes(gamma, document, fill = factor(document))) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~ topic, scales = "free") +
#   scale_y_reordered()

Can it identify the five different Concentrations?

The five concentrations are as follows:
- Logistics & Supply Chain Management
- Intelligent Mobility
- Quantitative Economics and Econometrics
- Big Data Analytics
- Health Systems Engineering

The LDA model we have seems to be able to spread the topics pretty well. But there seems to be a shortcoming in its ability to separate one concentration from one another. I believe this is due to the fact that a lot of the DSBA curriculum overlaps in many ways.

course_top_terms <- course_topics %>% 
  filter(!is.na(term)) %>% 
  group_by(topic) %>%
  slice_max(beta, n = 9) %>% 
  ungroup() %>%
  arrange(topic, -beta)

course_top_terms %>% 
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") + 
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  
  scale_y_reordered() 

ggsave("img/lda.png", dpi = 300)

Distance metrics

There are some distance metrics I would like to try
- Hellinger Distance (In-progress) - Cosine Similarity (Isn’t this done when using MCA/CA?)
- Jaccard Similary (In-progress) - Sorensen-Dice Similarity (In-progress)

Euclidean Distance

dist_euc <- bigram_dtm %>% 
  tidy() %>% 
  pairwise_dist(item = document, feature = term, value = count, method = "euclidean") 

dist_euc %>%
  filter(distance < dist_lmt) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = distance), show.legend = legends) +
  geom_node_point(color = "lightblue", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()+ 
  labs(title = "Distance Plot: Euclidean")

Manhattan Distance

dist_manh <- bigram_dtm %>% 
  tidy() %>% 
  pairwise_dist(item = document, feature = term, value = count, method = "manhattan") 

dist_manh %>%
  filter(distance < 15) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = distance), show.legend = legends) +
  geom_node_point(color = "lightblue", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "Distance Plot: Manhattan")

Cosine Distance

sim_cos <- bigram_dtm %>% 
  tidy() %>% 
  pairwise_similarity(item = document, feature = term, value = count)

sim_cos %>% 
  mutate(distance = 1 - similarity) %>% 
  filter(distance < ifelse(params$department == "Computer Science", 0.9, 0.02)) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = distance), show.legend = legends) +
  geom_node_point(color = "lightblue", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "1 - Cosine Similarity Plot: Bigrams")

Burrow’s Delta

delta_brw <- bigram_dtm %>% 
  tidy() %>% 
  pairwise_delta(item = document, feature = term, value = count, method = "burrows")

delta_brw %>% 
  filter(delta < 0.1) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = delta), show.legend = legends) +
  geom_node_point(color = "lightblue", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "Burrows Delta")

Linear Delta

delta_lnr <- bigram_dtm %>% 
  tidy() %>% 
  pairwise_delta(item = document, feature = term, value = count, method = "argamon")

delta_lnr %>% 
  filter(delta < 0.029) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = delta), show.legend = legends) +
  geom_node_point(color = "lightblue", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "Argamon's Linear Delta")

Distance metrics with full course descriptions

This is the same as the graphs above, but using the full course descriptions by: - Getting each word individually
- removing the stop words (a, the, and)
- Rejoining all of the descriptions together
- Computing the distance matrices based on the full descriptions

# Doing what I said above
course_full_desc <- full_outl %>% 
  select(c(Course_ID, new_col)) %>% 
  unnest_tokens("word", new_col) %>% 
  filter(!word %in% stop_words$word) %>% 
  filter(!grepl("[0-9]", word)) %>% 
  group_by(Course_ID) %>% 
  summarise(text = str_c(word, collapse = " ")) %>% 
  ungroup() %>% 
  filter(!is.na(text))

Cosine Similarity

cos_mat <- stringdistmatrix(course_full_desc$text, course_full_desc$text, useNames = FALSE, method = "cosine") %>% 
  as.matrix()

colnames(cos_mat) <- course_full_desc$Course_ID
rownames(cos_mat) <- course_full_desc$Course_ID

cos_course <- reshape2::melt(cos_mat)[reshape2::melt(upper.tri(cos_mat))$value,]

colnames(cos_course) <- c("Term1", "Term2", "distance")


cos_course %>% 
  filter(distance < 0.02) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = distance), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "1 - Cosine Similarity Plot: Full Desc.")

ggsave("img/cos.png", dpi = 300)

Jaccard Similarity

jac_mat <- stringdistmatrix(course_full_desc$text, course_full_desc$text, useNames = FALSE, method = "jaccard") %>% 
  as.matrix()

colnames(jac_mat) <- course_full_desc$Course_ID
rownames(jac_mat) <- course_full_desc$Course_ID

jac_course <- reshape2::melt(jac_mat)[reshape2::melt(upper.tri(jac_mat))$value,]

colnames(jac_course) <- c("Term1", "Term2", "distance")


jac_course %>% 
  filter(distance < 0.04) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = distance), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "1 - Jaccard Similarity Plot: Full Desc.")

ggsave("img/jac.png", dpi = 300)

MDS

All MDS implementations are nonmetric (for ordinal data).

MDS with Euclidean Distance

library(plotly)
mds_euc <- bigram_dtm %>% 
  stats::dist(method = "euclidean") %>% 
  # t() %>% 
  mds(type = "ordinal")

ggplot() +
  geom_point(data = as_tibble(mds_euc$conf), aes(x = D1, y = D2)) +
  geom_text(as_tibble(mds_euc$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
  theme_minimal() +
  labs(title = "MDS with Euclidean Distance Matrix")

MDS with Manhattan Distance

mds_man <- bigram_dtm %>% 
  stats::dist(method = "manhattan") %>% 
  # t() %>% 
  mds(type = "ordinal")

ggplot() +
  geom_point(data = as_tibble(mds_man$conf), aes(x = D1, y = D2)) +
  geom_text(as_tibble(mds_man$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
  theme_minimal() +
  geom_text_repel() +
  labs(title = "MDS with Manhattan Distance Matrix")

MDS with 1 - Cosine Similarity

library(slam)
cosine_dist_mat <- 1 - crossprod_simple_triplet_matrix(t(bigram_dtm))/(sqrt(col_sums(t(bigram_dtm)^2) %*% t(col_sums(t(bigram_dtm)^2))))

mds_cos <- cosine_dist_mat %>% 
  # t() %>% 
  mds(type = "ordinal")


ggplot() +
  geom_point(data = as_tibble(mds_cos$conf), aes(x = D1, y = D2)) +
  geom_text(as_tibble(mds_cos$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
  geom_text_repel() +
  theme_minimal() +
  labs(title = "MDS with 1 - Cosine Similarity")

MDS with 1 - Jaccard Similarity

Still trying to figure this one out

# mds_jac <- bigram_dtm %>% 
#   dist(method = "Jaccard", pairwise = TRUE) %>% 
#   # t() %>% 
#   mds(type = "ordinal")
# 
# 
# ggplot() +
#   geom_point(data = as_tibble(mds_jac$conf), aes(x = D1, y = D2)) +
#   geom_text(as_tibble(mds_jac$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
#   geom_text_repel() +
#   theme_minimal() +
#   labs(title = "MDS with 1 - Jaccard Similarity")

MDS with Course Descriptions

MDS with 1 - Cosine Similarity

mds_cos_mat <- cos_mat %>% 
  mds(type = "ordinal")

ggplot() +
  geom_point(data = as_tibble(mds_cos_mat$conf), aes(x = D1, y = D2, colour = D2 > 0.5)) +
  scale_colour_manual(values = setNames(c('#532d8e','grey'),c(T, F))) +
  scale_alpha_manual(values = c(1, 0.01)) +
  geom_text(as_tibble(mds_cos_mat$conf), mapping = aes(
    x = -D1, y = -D2, color = D2 < -0.5, label = paste(rownames(mds_cos_mat$conf))), alpha = .7) +
  geom_text_repel() +
  theme_minimal() +
  labs(title = "MDS with 1 - Cosine Similarity") +
  theme(legend.position = "")

ggsave("img/cos_mds.png", dpi = 300) 

MDS with 1 - Jaccard Similarity

mds_jac_mat <- jac_mat %>% 
  mds(type = "ordinal")

ggplot() +
  geom_point(data = as_tibble(mds_jac_mat$conf), aes(x = D1, y = D2, colour = D2 > 0.5 | D2 < -0.55)) +
  scale_colour_manual(values = setNames(c('#532d8e','grey'),c(T, F))) +
  scale_alpha_manual(values = c(1, 0.01)) +
  geom_text(as_tibble(mds_jac_mat$conf), mapping = aes(
    x = -D1, y = -D2, color = D2 < -0.5  | D2 > 0.55, label = paste(rownames(mds_jac_mat$conf))), alpha = .7) +
  geom_text_repel() +
  theme_minimal() +
  labs(title = "MDS with 1 - Jaccard Similarity") +
  theme(legend.position = "")

ggsave("img/jac_mds.png", dpi = 300)